In [1]:
# Importing
import theano.tensor as T
import sys, os
sys.path.append("../GeMpy")
sys.path.append("../pygeomod")

import GeMpy_core
import Visualization

import importlib
importlib.reload(GeMpy_core)
importlib.reload(Visualization)
import numpy as np
import pandas as pn
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

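# Force synchronous CUDA kernel launches so that GPU timings are meaningful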
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
np.set_printoptions(precision=6, linewidth=130, suppress=True)

%matplotlib inline
#%matplotlib notebook


Using gpu device 0: GeForce GTX 970 (CNMeM is disabled, cuDNN not available)

Sandstone Model

First we create a GeMpy instance with most parameters left at their defaults (except the range, which is given by the project). Then we fix the extent and the resolution of the domain we want to interpolate. Finally we compile the function, which only needs to happen once per session (the Theano developers are working on making it possible to load compiled functions from disk, although in our case compilation time is not a big deal).

General note: so far the rescaling factor is calculated for all series at the same time, whereas GeoModeller computes it individually for every potential field. I still have to look into what this parameter exactly means.
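To make the note concrete, here is a minimal sketch of what a single, global rescaling factor could look like (my reading of the idea, not GeMpy's or GeoModeller's actual implementation): normalize by the largest coordinate range over all input points of all series.

import numpy as np

# Sketch only: one rescaling factor shared by every series, derived from
# the joint extent of all interface and foliation points.
def global_rescaling_factor(points):
    # points: (n, 3) array with all X, Y, Z coordinates of the project
    ranges = points.max(axis=0) - points.min(axis=0)
    return 1.0 / ranges.max()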


In [ ]:
# Importing the raw data and defining the series
sandstone = GeMpy_core.GeMpy()

# Create Data class with raw data
sandstone.import_data(696000, 747000, 6863000, 6950000, -20000, 2000,
                      path_f=os.pardir + "/input_data/a_Foliations.csv",
                      path_i=os.pardir + "/input_data/a_Points.csv")

sandstone.Data.set_series({"EarlyGranite_Series": sandstone.Data.formations[-1],
                           "BIF_Series": (sandstone.Data.formations[0],
                                          sandstone.Data.formations[1]),
                           "SimpleMafic_Series": sandstone.Data.formations[2]},
                          order=["EarlyGranite_Series",
                                 "BIF_Series",
                                 "SimpleMafic_Series"])

All input data is stored in pandas DataFrames under self.Data.Interfaces and self.Data.Foliations:
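For orientation, the two tables presumably hold something like the following columns (hypothetical values; the exact column names are an assumption):

import pandas as pn

# Hypothetical rows illustrating the expected structure, not real data.
interfaces = pn.DataFrame({'X': [700000.], 'Y': [6870000.], 'Z': [-1000.],
                           'formation': ['EarlyGranite']})
foliations = pn.DataFrame({'X': [705000.], 'Y': [6880000.], 'Z': [-500.],
                           'dip': [30.], 'azimuth': [90.], 'polarity': [1],
                           'formation': ['SimpleBIF']})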


In [22]:
sandstone.Data.Foliations;

In [23]:
# Create a Grid class; so far only a regular grid is implemented
sandstone.create_grid()
sandstone.Grid.grid


Out[23]:
array([[  696000.      ,  6863000.      ,   -20000.      ],
       [  696000.      ,  6863000.      ,   -19551.019531],
       [  696000.      ,  6863000.      ,   -19102.041016],
       ..., 
       [  747000.      ,  6950000.      ,     1102.040771],
       [  747000.      ,  6950000.      ,     1551.020386],
       [  747000.      ,  6950000.      ,     2000.      ]], dtype=float32)
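For reference, a regular grid with this layout can be rebuilt directly with NumPy; the 50-point resolution per axis is an assumption inferred from the z-spacing above (22000 / 49 ≈ 448.98):

import numpy as np

nx = ny = nz = 50  # assumed resolution per axis
x = np.linspace(696000, 747000, nx, dtype='float32')
y = np.linspace(6863000, 6950000, ny, dtype='float32')
z = np.linspace(-20000, 2000, nz, dtype='float32')

# z varies fastest, matching the row ordering of sandstone.Grid.grid above
xx, yy, zz = np.meshgrid(x, y, z, indexing='ij')
grid = np.stack([xx.ravel(), yy.ravel(), zz.ravel()], axis=1)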

Plotting raw data


In [19]:
sandstone.Plot.plot_data(serie = sandstone.Data.series.columns.values[1])


Class Interpolator

This class takes the data from the Data class and computes the potential fields and the block model.


In [24]:
sandstone.set_interpolator()

Early granite


In [26]:
sandstone.Plot.plot_potential_field(10, n_pf=0)


/home/miguel/anaconda3/lib/python3.5/site-packages/matplotlib/font_manager.py:1288: UserWarning: findfont: Font family ['Times New Roman'] not found. Falling back to Bitstream Vera Sans
  (prop.get_family(), self.defaultFamily[fontext]))

BIF Series


In [31]:
sandstone.Plot.plot_potential_field(10, n_pf=1, cmap="magma", plot_data=False)


Simple mafic


In [29]:
sandstone.Plot.plot_potential_field(10, n_pf=2)


Optimizing the export of lithologies

Here I am going to try to return the internal type of the result (dual kriging, I guess) from the Theano interpolation function, so that I can write another Python function that decides which potential field to evaluate at every grid_pos.
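As a first approximation, such a dispatcher could look like the sketch below (all names hypothetical; GeMpy's actual compute_block_model drives the compiled Theano function instead):

import numpy as np

# Sketch only: evaluate the series in stratigraphic order and let each one
# claim the grid positions that no earlier series has claimed yet.
def assemble_block(potential_fields, isovalues):
    block = np.zeros(potential_fields[0].shape, dtype='int32')  # 0 = empty
    for series_id, (pf, iso) in enumerate(zip(potential_fields, isovalues), 1):
        inside = (pf >= iso) & (block == 0)
        block[inside] = series_id
    return block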


In [34]:
# Reset the block
sandstone.Interpolator.block.set_value(np.zeros_like(sandstone.Grid.grid[:, 0]))

# Compute the block
sandstone.Interpolator.compute_block_model([0, 1, 2], verbose=1)


4
[1 1 1 ..., 1 1 1] 0
The serie formations are EarlyGranite
[1 2]
[1 1 1 ..., 1 1 1] 10967
The serie formations are SimpleMafic2|SimpleBIF
3
[1 1 1 ..., 1 1 1] 13089
The serie formations are SimpleMafic1

In [35]:
sandstone.Plot.plot_block_section()


Export vtk


In [14]:
"""Export model to VTK

Export the geology blocks to VTK for visualisation of the entire 3-D model in an
external VTK viewer, e.g. Paraview.

..Note:: Requires pyevtk, available for free on: https://github.com/firedrakeproject/firedrake/tree/master/python/evtk

**Optional keywords**:
    - *vtk_filename* = string : filename of VTK file (default: output_name)
    - *data* = np.array : data array to export to VTK (default: entire block model)
"""
vtk_filename = "noddyFunct2"

extent_x = 10
extent_y = 10
extent_z = 10

delx = 0.2
dely = 0.2
delz = 0.2
from pyevtk.hl import gridToVTK
# Coordinates
x = np.arange(0, extent_x + 0.1*delx, delx, dtype='float64')
y = np.arange(0, extent_y + 0.1*dely, dely, dtype='float64')
z = np.arange(0, extent_z + 0.1*delz, delz, dtype='float64')

# self.block = np.swapaxes(self.block, 0, 2)
gridToVTK(vtk_filename, x, y, z, cellData={"geology": sol})


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-14-8308869b7d0c> in <module>()
     19 dely = 0.2
     20 delz = 0.2
---> 21 from pyevtk.hl import gridToVTK
     22 # Coordinates
     23 x = np.arange(0, extent_x + 0.1*delx, delx, dtype='float64')

ImportError: No module named 'pyevtk'
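pyevtk can be installed from PyPI (pip install pyevtk). Once it is available, a call adapted to the sandstone block computed above could look like this (a sketch: the 50×50×50 resolution is an assumption inferred from the grid output earlier, and sol stands for the flattened block model):

import numpy as np
from pyevtk.hl import gridToVTK

nx = ny = nz = 50                    # assumed model resolution
geology = sol.reshape(nx, ny, nz)    # sol: flattened block model

# gridToVTK stores cellData per cell, so pass nx + 1 coordinates for nx cells
x = np.linspace(696000, 747000, nx + 1)
y = np.linspace(6863000, 6950000, ny + 1)
z = np.linspace(-20000, 2000, nz + 1)

gridToVTK("sandstone_block", x, y, z, cellData={"geology": geology})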

Performance Analysis

CPU


In [32]:
%%timeit
sol = interpolator.geoMigueller(dips, dips_angles, azimuths, polarity, rest, ref)[0]


10 loops, best of 3: 46.8 ms per loop
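The profiles in this section are available presumably because the underlying function was compiled with profiling enabled; in plain Theano that looks like the following minimal sketch (independent of GeMpy):

import theano
import theano.tensor as T

a = T.matrix('a')
f = theano.function([a], a.sum(), profile=True)  # collect per-Op timings
f([[1., 2.], [3., 4.]])
f.profile.summary()  # prints a report like the ones below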

In [172]:
sandstone.block_export.profile.summary()


Function profiling
==================
  Message: /home/bl3/PycharmProjects/GeMpy/GeMpy/GeoMig.py:778
  Time in 12 calls to Function.__call__: 1.747251e+01s
  Time in Function.fn.__call__: 1.746418e+01s (99.952%)
  Time in thunks: 1.695012e+01s (97.010%)
  Total compile time: 8.084836e+00s
    Number of Apply nodes: 358
    Theano Optimizer time: 7.368957e+00s
       Theano validate time: 3.972366e-01s
    Theano Linker time (includes C, CUDA code generation/compiling): 4.228358e-01s
       Import time 0.000000e+00s

Time in all call to theano.grad() 0.000000e+00s
Time since theano import 4116.375s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
  51.6%    51.6%       8.746s       6.57e-03s     C     1332     111   theano.tensor.elemwise.Elemwise
  21.5%    73.1%       3.647s       2.76e-02s     C      132      11   theano.tensor.basic.Alloc
   9.1%    82.2%       1.545s       4.29e-02s     Py      36       3   theano.tensor.basic.Nonzero
   6.9%    89.1%       1.170s       8.12e-03s     C      144      12   theano.tensor.basic.Join
   6.8%    95.9%       1.153s       9.61e-03s     C      120      10   theano.tensor.blas.Dot22Scalar
   3.3%    99.2%       0.552s       5.75e-03s     C       96       8   theano.tensor.elemwise.Sum
   0.5%    99.7%       0.080s       3.35e-03s     C       24       2   theano.tensor.subtensor.AdvancedSubtensor1
   0.3%    99.9%       0.043s       3.58e-03s     C       12       1   theano.tensor.subtensor.AdvancedIncSubtensor1
   0.0%    99.9%       0.005s       4.09e-04s     Py      12       1   theano.tensor.nlinalg.MatrixInverse
   0.0%   100.0%       0.002s       1.75e-04s     Py      12       1   theano.scan_module.scan_op.Scan
   0.0%   100.0%       0.002s       1.66e-04s     C       12       1   theano.compile.ops.DeepCopyOp
   0.0%   100.0%       0.001s       3.25e-06s     C      300      25   theano.tensor.subtensor.IncSubtensor
   0.0%   100.0%       0.001s       2.46e-06s     C      384      32   theano.tensor.basic.Reshape
   0.0%   100.0%       0.001s       1.18e-06s     C      780      65   theano.tensor.elemwise.DimShuffle
   0.0%   100.0%       0.001s       2.84e-06s     C      324      27   theano.tensor.subtensor.Subtensor
   0.0%   100.0%       0.001s       4.49e-05s     Py      12       1   theano.tensor.extra_ops.FillDiagonal
   0.0%   100.0%       0.000s       1.88e-06s     C      204      17   theano.tensor.opt.MakeVector
   0.0%   100.0%       0.000s       9.55e-07s     C      216      18   theano.tensor.basic.ScalarFromTensor
   0.0%   100.0%       0.000s       1.45e-06s     C      108       9   theano.compile.ops.Shape_i
   0.0%   100.0%       0.000s       1.15e-05s     C       12       1   theano.tensor.blas_c.CGemv
   ... (remaining 2 Classes account for   0.00%(0.00s) of the runtime)

Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
  21.5%    21.5%       3.647s       2.76e-02s     C      132       11   Alloc
  17.9%    39.5%       3.040s       2.53e-01s     C       12        1   Elemwise{Composite{(i0 * i1 * LT(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4), i5) * (((i6 + ((i7 * Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4)) / i8)) - ((i9 * Composite{(sqr(i0) * i0)}(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4))) / i10)) + ((i11 * Composite{(sqr(sqr(i0)) * i0)}(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4))) / i12)))}}[(0, 1)]
  16.1%    55.6%       2.736s       2.28e-01s     C       12        1   Elemwise{Composite{(i0 * ((LT(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3), i4) * ((i5 + (i6 * Composite{(sqr(i0) * i0)}((Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3) / i4))) + (i7 * Composite{((sqr(sqr(i0)) * sqr(i0)) * i0)}((Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3) / i4)))) - ((i8 * sqr((Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3) / i4))) + (i9 * Composite{(sqr(sqr(i0)
  12.4%    68.0%       2.102s       1.17e-02s     C      180       15   Elemwise{sub,no_inplace}
   9.1%    77.1%       1.545s       4.29e-02s     Py      36        3   Nonzero
   6.9%    84.0%       1.170s       8.12e-03s     C      144       12   Join
   6.8%    90.8%       1.153s       9.61e-03s     C      120       10   Dot22Scalar
   3.6%    94.4%       0.602s       7.17e-03s     C       84        7   Elemwise{mul,no_inplace}
   3.2%    97.5%       0.536s       1.12e-02s     C       48        4   Sum{axis=[0], acc_dtype=float64}
   1.0%    98.5%       0.165s       6.86e-03s     C       24        2   Elemwise{Mul}[(0, 1)]
   0.5%    99.0%       0.080s       3.35e-03s     C       24        2   AdvancedSubtensor1
   0.4%    99.4%       0.067s       5.62e-03s     C       12        1   Elemwise{Composite{((i0 / i1) + ((i2 * i3) / i1) + ((i4 * i5 * i6) / i7))}}[(0, 0)]
   0.3%    99.6%       0.043s       3.58e-03s     C       12        1   AdvancedIncSubtensor1{no_inplace,set}
   0.1%    99.7%       0.017s       1.40e-03s     C       12        1   Elemwise{Composite{(i0 * LE(i1, i2) * GE(i1, i3))}}
   0.1%    99.8%       0.015s       3.22e-04s     C       48        4   Sum{axis=[1], acc_dtype=float64}
   0.1%    99.9%       0.010s       2.91e-04s     C       36        3   Elemwise{Sqr}[(0, 0)]
   0.0%    99.9%       0.005s       4.09e-04s     Py      12        1   MatrixInverse
   0.0%    99.9%       0.003s       2.32e-04s     C       12        1   Elemwise{Cast{int8}}
   0.0%    99.9%       0.002s       1.75e-04s     Py      12        1   for{cpu,scan_fn}
   0.0%    99.9%       0.002s       1.66e-04s     C       12        1   DeepCopyOp
   ... (remaining 79 Ops account for   0.06%(0.01s) of the runtime)

Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
  20.2%    20.2%       3.425s       2.85e-01s     12   331   Alloc(CGemv{inplace}.0, Elemwise{Composite{((i0 // i1) + i2)}}[(0, 0)].0, TensorConstant{1}, TensorConstant{1}, Elemwise{add,no_inplace}.0)
  17.9%    38.1%       3.040s       2.53e-01s     12   339   Elemwise{Composite{(i0 * i1 * LT(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4), i5) * (((i6 + ((i7 * Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4)) / i8)) - ((i9 * Composite{(sqr(i0) * i0)}(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4))) / i10)) + ((i11 * Composite{(sqr(sqr(i0)) * i0)}(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4))) / i12)))}}[(0, 1)](Subtensor{:int64:}.0, Join.0, Reshape{2
  16.1%    54.3%       2.736s       2.28e-01s     12   340   Elemwise{Composite{(i0 * ((LT(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3), i4) * ((i5 + (i6 * Composite{(sqr(i0) * i0)}((Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3) / i4))) + (i7 * Composite{((sqr(sqr(i0)) * sqr(i0)) * i0)}((Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3) / i4)))) - ((i8 * sqr((Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3) / i4))) + (i9 * Composite{(sqr(sqr(i0)) * i0)}((C
   8.1%    62.4%       1.374s       1.14e-01s     12   134   Nonzero(Reshape{1}.0)
   8.0%    70.4%       1.361s       1.13e-01s     12   290   Elemwise{sub,no_inplace}(InplaceDimShuffle{0,x}.0, InplaceDimShuffle{1,0}.0)
   6.6%    77.0%       1.123s       9.36e-02s     12   304   Join(TensorConstant{0}, Elemwise{sub,no_inplace}.0, Elemwise{sub,no_inplace}.0, Elemwise{sub,no_inplace}.0)
   3.7%    80.8%       0.632s       5.27e-02s     12   264   Dot22Scalar(Reshape{2}.0, InplaceDimShuffle{1,0}.0, TensorConstant{2.0})
   3.4%    84.1%       0.570s       4.75e-02s     12    42   Elemwise{mul,no_inplace}(universal matrix, InplaceDimShuffle{x,0}.0)
   2.3%    86.4%       0.382s       3.18e-02s     12   292   Elemwise{sub,no_inplace}(InplaceDimShuffle{0,x}.0, InplaceDimShuffle{1,0}.0)
   2.1%    88.5%       0.359s       2.99e-02s     12   291   Elemwise{sub,no_inplace}(InplaceDimShuffle{0,x}.0, InplaceDimShuffle{1,0}.0)
   2.0%    90.5%       0.346s       2.89e-02s     12   342   Sum{axis=[0], acc_dtype=float64}(Elemwise{Composite{(i0 * i1 * LT(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4), i5) * (((i6 + ((i7 * Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4)) / i8)) - ((i9 * Composite{(sqr(i0) * i0)}(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4))) / i10)) + ((i11 * Composite{(sqr(sqr(i0)) * i0)}(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i2, i3, i4))) / i12)))}}[(0, 1)].0)
   1.6%    92.2%       0.273s       2.27e-02s     12   265   Dot22Scalar(Rest of the points of the layers, InplaceDimShuffle{1,0}.0, TensorConstant{2.0})
   1.5%    93.6%       0.247s       2.06e-02s     12   266   Dot22Scalar(Reference points for every layer, InplaceDimShuffle{1,0}.0, TensorConstant{2.0})
   1.3%    94.9%       0.222s       1.85e-02s     12   240   Alloc(Subtensor{:int64:}.0, Elemwise{Composite{((i0 // i1) + i2)}}[(0, 0)].0, TensorConstant{1}, TensorConstant{1}, Elemwise{Composite{Switch(LT(i0, i1), Switch(LT((i2 + i0), i1), i1, (i2 + i0)), Switch(LT(i0, i2), i0, i2))}}.0)
   1.0%    95.9%       0.165s       1.37e-02s     12   338   Elemwise{Mul}[(0, 1)](Subtensor{int64::}.0, InplaceDimShuffle{1,0}.0, Subtensor{:int64:}.0)
   0.9%    96.8%       0.151s       1.26e-02s     12   343   Sum{axis=[0], acc_dtype=float64}(Elemwise{Composite{(i0 * ((LT(Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3), i4) * ((i5 + (i6 * Composite{(sqr(i0) * i0)}((Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3) / i4))) + (i7 * Composite{((sqr(sqr(i0)) * sqr(i0)) * i0)}((Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3) / i4)))) - ((i8 * sqr((Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}(i1, i2, i3) / i4))) + (i9 * 
   0.8%    97.6%       0.136s       1.13e-02s     12   179   Nonzero(Reshape{1}.0)
   0.4%    98.0%       0.067s       5.62e-03s     12   344   Elemwise{Composite{((i0 / i1) + ((i2 * i3) / i1) + ((i4 * i5 * i6) / i7))}}[(0, 0)](Sum{axis=[0], acc_dtype=float64}.0, InplaceDimShuffle{x}.0, InplaceDimShuffle{x}.0, Sum{axis=[0], acc_dtype=float64}.0, TensorConstant{(1,) of -1.0}, InplaceDimShuffle{x}.0, Sum{axis=[0], acc_dtype=float64}.0, InplaceDimShuffle{x}.0)
   0.4%    98.3%       0.060s       4.96e-03s     12   180   AdvancedSubtensor1(Reshape{1}.0, Subtensor{int64}.0)
   0.3%    98.6%       0.043s       3.58e-03s     12   356   AdvancedIncSubtensor1{no_inplace,set}(<TensorType(float32, vector)>, Sum{axis=[0], acc_dtype=float64}.0, Subtensor{int64}.0)
   ... (remaining 338 Apply instances account for 1.42%(0.24s) of the runtime)

Here are tips to potentially make your code run faster
                 (if you think of new ones, suggest them on the mailing list).
                 Test them first, as they are not guaranteed to always provide a speedup.
  - Try the Theano flag floatX=float32
We don't know if amdlibm will accelerate this scalar op. deg2rad
We don't know if amdlibm will accelerate this scalar op. deg2rad
  - Try installing amdlibm and set the Theano flag lib.amdlibm=True. This speeds up only some Elemwise operation.

GPU


In [20]:
%%timeit
# Reset the block
sandstone.block.set_value(np.zeros_like(sandstone.grid[:, 0]))

# Compute the block
sandstone.compute_block_model([0, 1, 2], verbose=0)


1 loop, best of 3: 1.51 s per loop

In [22]:
sandstone.block_export.profile.summary()


Function profiling
==================
  Message: /home/miguel/PycharmProjects/GeMpy/GeMpy/GeoMig.py:791
  Time in 15 calls to Function.__call__: 7.831817e+00s
  Time in Function.fn.__call__: 7.828252e+00s (99.954%)
  Time in thunks: 7.533441e+00s (96.190%)
  Total compile time: 1.917513e+01s
    Number of Apply nodes: 468
    Theano Optimizer time: 4.435555e+00s
       Theano validate time: 4.534802e-01s
    Theano Linker time (includes C, CUDA code generation/compiling): 1.452507e+01s
       Import time 3.308706e-01s

Time in all call to theano.grad() 0.000000e+00s
Time since theano import 1392.894s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
  69.2%    69.2%       5.213s       3.48e-01s     C       15       1   theano.sandbox.cuda.basic_ops.GpuAdvancedIncSubtensor1
  13.1%    82.3%       0.988s       7.49e-04s     C     1320      88   theano.tensor.elemwise.Elemwise
   4.3%    86.6%       0.322s       2.15e-03s     C      150      10   theano.tensor.blas.Dot22Scalar
   3.2%    89.8%       0.238s       5.30e-03s     Py      45       3   theano.tensor.basic.Nonzero
   3.0%    92.7%       0.223s       4.95e-04s     C      450      30   theano.sandbox.cuda.basic_ops.HostFromGpu
   2.6%    95.4%       0.199s       1.90e-03s     C      105       7   theano.tensor.elemwise.Sum
   1.7%    97.1%       0.131s       3.81e-04s     C      345      23   theano.sandbox.cuda.basic_ops.GpuFromHost
   1.1%    98.3%       0.086s       2.88e-03s     C       30       2   theano.sandbox.cuda.basic_ops.GpuAdvancedSubtensor1
   0.5%    98.8%       0.041s       3.36e-05s     C     1230      82   theano.sandbox.cuda.basic_ops.GpuElemwise
   0.4%    99.2%       0.028s       2.63e-04s     C      105       7   theano.sandbox.cuda.basic_ops.GpuAlloc
   0.3%    99.5%       0.024s       1.63e-03s     Py      15       1   theano.tensor.nlinalg.MatrixInverse
   0.2%    99.7%       0.016s       1.21e-04s     C      135       9   theano.sandbox.cuda.basic_ops.GpuJoin
   0.2%    99.9%       0.012s       7.98e-04s     Py      15       1   theano.scan_module.scan_op.Scan
   0.0%    99.9%       0.004s       1.09e-05s     C      345      23   theano.sandbox.cuda.basic_ops.GpuReshape
   0.0%    99.9%       0.002s       1.01e-04s     C       15       1   theano.sandbox.cuda.basic_ops.GpuCAReduce
   0.0%    99.9%       0.001s       2.67e-06s     C      255      17   theano.sandbox.cuda.basic_ops.GpuSubtensor
   0.0%   100.0%       0.001s       1.85e-06s     C      360      24   theano.tensor.subtensor.IncSubtensor
   0.0%   100.0%       0.001s       9.25e-07s     C      645      43   theano.sandbox.cuda.basic_ops.GpuDimShuffle
   0.0%   100.0%       0.000s       3.00e-05s     Py      15       1   theano.tensor.extra_ops.FillDiagonal
   0.0%   100.0%       0.000s       8.01e-06s     C       45       3   theano.tensor.basic.Join
   ... (remaining 11 Classes account for   0.03%(0.00s) of the runtime)

Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
  69.2%    69.2%       5.213s       3.48e-01s     C       15        1   GpuAdvancedIncSubtensor1{inplace,set}
  12.1%    81.3%       0.911s       6.08e-03s     C      150       10   Elemwise{Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}}
   4.3%    85.6%       0.322s       2.15e-03s     C      150       10   Dot22Scalar
   3.2%    88.7%       0.238s       5.30e-03s     Py      45        3   Nonzero
   3.0%    91.7%       0.223s       4.95e-04s     C      450       30   HostFromGpu
   2.4%    94.1%       0.179s       5.98e-03s     C       30        2   Sum{axis=[0], acc_dtype=float64}
   1.7%    95.8%       0.131s       3.81e-04s     C      345       23   GpuFromHost
   1.1%    97.0%       0.086s       2.88e-03s     C       30        2   GpuAdvancedSubtensor1
   0.6%    97.5%       0.042s       4.03e-04s     C      105        7   Elemwise{Composite{Identity((i0 / i1))}}[(0, 0)]
   0.4%    97.9%       0.027s       3.03e-04s     C       90        6   GpuAlloc
   0.3%    98.2%       0.024s       1.63e-03s     Py      15        1   MatrixInverse
   0.2%    98.4%       0.016s       1.09e-03s     C       15        1   Elemwise{Composite{(i0 + ((i1 * i2) / i3) + ((i4 * i1 * i5) / i6))}}[(0, 2)]
   0.2%    98.6%       0.016s       1.21e-04s     C      135        9   GpuJoin
   0.2%    98.8%       0.015s       9.72e-04s     C       15        1   Sum{axis=[0], acc_dtype=float64}
   0.2%    99.0%       0.012s       7.98e-04s     Py      15        1   for{cpu,scan_fn}
   0.1%    99.1%       0.010s       9.39e-05s     C      105        7   GpuElemwise{Composite{Cast{float32}(LT(i0, i1))},no_inplace}
   0.1%    99.2%       0.007s       3.49e-05s     C      210       14   GpuElemwise{sub,no_inplace}
   0.1%    99.3%       0.007s       7.58e-05s     C       90        6   GpuElemwise{Composite{(i0 * (i1 ** i2))},no_inplace}
   0.1%    99.4%       0.007s       6.48e-05s     C      105        7   GpuElemwise{mul,no_inplace}
   0.1%    99.5%       0.006s       9.45e-05s     C       60        4   Elemwise{Cast{float64}}
   ... (remaining 116 Ops account for   0.52%(0.04s) of the runtime)

Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
  69.2%    69.2%       5.213s       3.48e-01s     15   467   GpuAdvancedIncSubtensor1{inplace,set}(<CudaNdarrayType(float32, vector)>, GpuCAReduce{add}{1,0}.0, Subtensor{int64}.0)
   6.0%    75.2%       0.453s       3.02e-02s     15   344   Elemwise{Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}}(Reshape{2}.0, Reshape{2}.0, Dot22Scalar.0)
   3.4%    78.6%       0.257s       1.72e-02s     15   342   Elemwise{Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}}(Reshape{2}.0, Reshape{2}.0, Dot22Scalar.0)
   2.7%    81.3%       0.200s       1.34e-02s     15   343   Elemwise{Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}}(Reshape{2}.0, Reshape{2}.0, Dot22Scalar.0)
   2.5%    83.8%       0.186s       1.24e-02s     15   306   Dot22Scalar(Elemwise{Cast{float64}}.0, InplaceDimShuffle{1,0}.0, TensorConstant{2.0})
   2.4%    86.1%       0.179s       1.19e-02s     15   181   Nonzero(HostFromGpu.0)
   1.9%    88.0%       0.144s       9.62e-03s     15   436   Sum{axis=[0], acc_dtype=float64}(HostFromGpu.0)
   1.7%    89.8%       0.131s       8.71e-03s     15   433   HostFromGpu(GpuElemwise{Composite{(i0 * i1 * i2 * (((i3 + (i4 / i5)) - ((i6 * i7) / i8)) + ((i9 * i10) / i11)))}}[(0, 1)].0)
   1.1%    90.9%       0.086s       5.75e-03s     15   305   Dot22Scalar(Elemwise{Cast{float64}}.0, InplaceDimShuffle{1,0}.0, TensorConstant{2.0})
   0.8%    91.8%       0.064s       4.26e-03s     15   227   GpuAdvancedSubtensor1(GpuReshape{1}.0, Subtensor{int64}.0)
   0.7%    92.5%       0.053s       3.50e-03s     15   355   GpuFromHost(Elemwise{Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}}.0)
   0.6%    93.1%       0.049s       3.24e-03s     15   304   Dot22Scalar(Elemwise{Cast{float64}}.0, InplaceDimShuffle{1,0}.0, TensorConstant{2.0})
   0.6%    93.7%       0.046s       3.05e-03s     15   204   Nonzero(HostFromGpu.0)
   0.6%    94.3%       0.043s       2.89e-03s     15   434   HostFromGpu(GpuElemwise{Composite{(i0 * ((i1 * ((i2 + i3 + i4) - (i5 + i6))) - (i7 * ((i2 + i8 + i9) - (i10 + i11)))))}}[(0, 0)].0)
   0.6%    94.9%       0.042s       2.82e-03s     15   435   Elemwise{Composite{Identity((i0 / i1))}}[(0, 0)](HostFromGpu.0, InplaceDimShuffle{x,x}.0)
   0.5%    95.3%       0.036s       2.39e-03s     15   353   GpuFromHost(Elemwise{Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}}.0)
   0.5%    95.8%       0.035s       2.34e-03s     15   437   Sum{axis=[0], acc_dtype=float64}(HostFromGpu.0)
   0.4%    96.2%       0.028s       1.86e-03s     15   354   GpuFromHost(Elemwise{Composite{Cast{float32}(sqrt(((i0 + i1) - i2)))}}.0)
   0.3%    96.5%       0.024s       1.63e-03s     15   419   MatrixInverse(IncSubtensor{InplaceSet;int64::, int64:int64:}.0)
   0.3%    96.8%       0.023s       1.50e-03s     15   241   GpuAdvancedSubtensor1(GpuReshape{1}.0, Subtensor{int64}.0)
   ... (remaining 448 Apply instances account for 3.20%(0.24s) of the runtime)

Here are tips to potentially make your code run faster
                 (if you think of new ones, suggest them on the mailing list).
                 Test them first, as they are not guaranteed to always provide a speedup.
  - Try installing amdlibm and set the Theano flag lib.amdlibm=True. This speeds up only some Elemwise operation.
